library(here)
library(cowplot)
source(here("utils/data_processing.R"))
source(here("utils/figures.R"))

Import data

df_gpt3 <- read_csv(here("data/processed_diagnoses/diagnoses_gpt3.csv.gz"))
df_gpt4 <- read_csv(here("data/processed_diagnoses/diagnoses_gpt4.csv.gz"))
df_gpt4_icd <- read_csv(here("data/processed_diagnoses/diagnoses_gpt4_icd.csv.gz"))

Rank abundance

rank_abundance_plot(df_gpt3)+ggtitle("GPT3")

rank_abundance_plot(df_gpt4)+ggtitle("GPT4")

rank_abundance_plot(df_gpt4_icd)+ggtitle("GPT4 ICD")

Top diagnoses

n_diag <- 25
top_diagnosis_plot(df_gpt3, n_diag = n_diag)+ggtitle("GPT3")

top_diagnosis_plot(df_gpt4, n_diag = n_diag)+ggtitle("GPT4")

top_diagnosis_plot(df_gpt4_icd, n_diag = n_diag)+ggtitle("GPT4 ICD")

Cumulative top frequency

cumulative_frequency_plot(df_gpt3)+ggtitle("GPT3")

cumulative_frequency_plot(df_gpt4)+ggtitle("GPT4")

cumulative_frequency_plot(df_gpt4_icd)+ggtitle("GPT4 ICD")

Diagnosis rank list

diagnosis_rank_table(df_gpt3, "mast |mastoc|anaphylaxis") %>% mutate(diagnosis = substr(diagnosis, 1, 60))
diagnosis_rank_table(df_gpt4, "mast |mastoc|anaphylaxis") %>% mutate(diagnosis = substr(diagnosis, 1, 60)) 
diagnosis_rank_table(df_gpt4_icd, "mast |mastoc|anaphylaxis") %>% mutate(diagnosis = substr(diagnosis, 1, 60)) 
diagnosis_rank_table(df_gpt4, "mast |mastoc|anaphylaxis") %>% mutate(diagnosis = substr(diagnosis, 1, 60)) %>% 
  select(contains(c("diagnosis","mcas"))) %>% 
  head(6) %>% 
  mutate(diagnosis = str_to_sentence(diagnosis)) %>% 
  rename(Diagnosis = diagnosis, `MCAS consortium` = mcas_consortium, `MCAS alternative` = mcas_alternative) %>% 
  flextable() %>% 
  width(width = 2) %>% 
  align(j = 2:3, align = "center", part = "all")

Diagnosis

MCAS consortium

MCAS alternative

Anaphylaxis

1

84

Mastocytosis

14

107

Mast cell activation syndrome

24

65

Systemic mastocytosis

27

155

Mast cell disorders

72

942

Idiopathic anaphylaxis

122

937

NA

Diversity

set.seed(1234)
df_diversity_ci <- df_gpt4 %>% 
  count(criteria, diagnosis, sort = T) %>% 
  group_by(criteria) %>% 
  nest() %>% 
  mutate(boot = map(data, function(x){
    x <- deframe(x)
    x <- entropart::EntropyCI(entropart::Shannon, 
                              Simulations=500, 
                              Ns = x, q=1, 
                              Correction = "None")
    x
  })) %>% 
  mutate(shannon = map_dbl(data, 
                           ~log(entropart::Diversity(deframe(.), q=1, Correction="None")))) %>% # Calculate shannon with entropart
  mutate(boot = map(boot, ~quantile(., c(0.025,0.5,0.975)))) %>% 
  unnest_wider(boot)
================================================================================
================================================================================
================================================================================
================================================================================
================================================================================
================================================================================
Warning: There were 9006 warnings in `mutate()`.
The first warning was:
ℹ In argument: `boot = map(...)`.
ℹ In group 1: `criteria = "aha_kawasaki"`.
Caused by warning in `CheckentropartArguments()`:
! Function arguments cannot be checked because the entropart package is not attached. Add CheckArguments=FALSE to suppress this warning or run library('entropart')
ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 9005 remaining warnings.Warning: There were 6 warnings in `mutate()`.
The first warning was:
ℹ In argument: `shannon = map_dbl(data, ~log(entropart::Diversity(deframe(.), q
  = 1, Correction = "None")))`.
ℹ In group 1: `criteria = "aha_kawasaki"`.
Caused by warning in `CheckentropartArguments()`:
! Function arguments cannot be checked because the entropart package is not attached. Add CheckArguments=FALSE to suppress this warning or run library('entropart')
ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 5 remaining warnings.
df_diversity_ci

df_diversity_ci %>% 
  format_criteria() %>% 
  ggplot(aes(x = criteria, y = shannon))+
  geom_point(size = 1)+
  geom_errorbar(aes(ymin = `2.5%`, ymax = `97.5%`), width = 0.3)+
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "", y = "Shannon diversity") 

Permutation testing

# Run outside of notebook
# Includes all 10,000 GPT iterations and 1000 bootstrap permutations 
# Contained in script source(here("scripts/diversity_analysis/permute_null_diversity_difference.R"))
library(here)
source(here("utils/data_processing.R"))
library(tidyverse)
library(vegan)
library(broom)
library(here)
library(furrr)
library(future.apply)

model <- "gpt4"
p <- 1000
i <- 10000

print("### Reading data")
read_path <- sprintf("data/processed_diagnoses/diagnoses_%s.csv.gz", model)
df <- read_csv(here(read_path))

print("### Calculating permutation")
perm_out <- difference_permutation_test(df, metric = "precision", permutations = p, gpt_iterations = i)

print("### Writing data")
write_path <- sprintf("data/diversity_analysis/precision_permutation_test_%s_p%s_i%s.RDS", model, p, i)
saveRDS(perm_out, here(write_path))
diversity_permutation_results <- readRDS(here("data/diversity_analysis/diversity_permutation_test_gpt4.RDS")) 
diversity_permutation_results
permutation_test_plot(diversity_permutation_results)

Similarity

Jaccard/Bray curtis

diagnosis_similarity_heatmap(df_gpt3, method = "bray")

diagnosis_similarity_heatmap(df_gpt4, method = "bray")

diagnosis_similarity_heatmap(df_gpt3)

diagnosis_similarity_heatmap(df_gpt4)

  • Bray-Curtis similarity measures the similarity of a given diagnostic criteria’s set of alternative diagnoses along with their frequencies.
  • This demonstrates that SLE criteria results in a very similar set and frequency of diagnoses, while the diagnoses associated with two MCAS criteria are as different from each other as they are from those generated by the criteria of other conditions.

PCA

diagnosis_pca_plot(df_gpt3) + ggtitle("GPT3")

diagnosis_pca_plot(df_gpt4) + ggtitle("GPT4")

Precision

  • Precision represents how similar each iteration of a 10-point differential diagnosis is with all other differential diagnoses from the same set of criteria.
  • I.e. how reproducible the 10-point differential diagnosis is for each criteria
  • Measured by obtaining the Bray-Curtis similarity values between all iterations within a criteria
# Script for calculating all Bray-Curtis similarity values within a criteria
# Found in source(here("scripts/diversity_analysis/calculate_precision.R"))
library(here)
source(here("utils/data_processing.R"))

models <- c("gpt3", "gpt4", "gpt4_icd")

for (m in models){
  print(sprintf("READING IN DATA FOR: %s", m))
  read_path <- sprintf("data/processed_diagnoses/diagnoses_%s.csv.gz", m)
  df <- read_csv(here(read_path))
  
  print(sprintf("CALCULATING PRECISION FOR: %s", m))
  df <- calculate_precision(df)
  
  print(sprintf("WRITING PRECISION DATA FOR: %s", m))
  out_path <- sprintf("data/diversity_analysis/diagnosis_precision_%s.csv.gz", m)
  write_csv(df, here(out_path))
}
df_precision <- vroom::vroom(here("data/diversity_analysis/diagnosis_precision_gpt4.csv.gz"))
Rows: 298535709 Columns: 2── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): criteria
dbl (1): distance
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_precision_summary <- df_precision %>% 
  mutate(distance = 1-distance) %>% # Convert to similarity
  summarise(mean_cl_normal(distance), .by = criteria)
df_precision_summary
calculate_boxplot_stats <- function(df){
  df %>% 
    nest(data = distance, .by = criteria) %>% 
    mutate(box = map(data, function(df){
      x <- boxplot.stats(df$distance)$stats
      x <- sort(x)
      names(x) <- c("min","lower","median","upper","max")
      return(x)
    })) %>% 
    select(-data) %>% 
    unnest_wider(box)
}



manual_box_plot <- function(df){
  df %>% 
    format_criteria() %>% 
    ggplot(aes(
      x = criteria,
      ymin = min,
      lower = lower,
      middle = median,
      upper = upper,
      ymax = max
    ))+
    geom_boxplot(stat = "identity") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))+
    labs(x = "", y = "Bray-Curtis similarity")
}
df_precision_stats <- df_precision %>% 
  mutate(distance = 1-distance) %>% # Convert to similarity
  calculate_boxplot_stats()
df_precision_stats
manual_box_plot(df_precision_stats)

Permutation testing

# Run outside of notebook
# Tried multiple combinations of bootstrap permutations and gpt_iterations
# Because a single bray-curtis matrix of all 10,000 comparisons takes ~10 minutes
# Contained in following script:
# source(here("scripts/diversity_analysis/permute_null_precision_difference.R"))
model <- "gpt4"
p <- 1000
i <- 10000

print("### Reading data")
read_path <- sprintf("data/processed_diagnoses/diagnoses_%s.csv.gz", model)
df <- read_csv(here(read_path))

print("### Calculating permutation")
perm_out <- difference_permutation_test(df, metric = "precision", permutations = p, gpt_iterations = i)

print("### Writing data")
write_path <- sprintf("data/diversity_analysis/precision_permutation_test_%s_p%s_i%s.RDS", model, p, i)
saveRDS(perm_out, here())
readRDS(here("data/diversity_analysis/precision_permutation_test_gpt4_p1000_i10000.RDS"))
readRDS(here("data/diversity_analysis/precision_permutation_test_gpt4_p1000_i10000.RDS")) %>% permutation_test_plot()

iNEXT

inext_plots <- function(inext_obj){
  for (i in 1:3){
    plt <- iNEXT::ggiNEXT(inext_obj, type=i, facet.var="Assemblage", color.var="Assemblage") +
      theme_classic() + 
      scale_color_brewer(palette = "Set1") +
      theme(axis.text.x = element_text(angle = 90))+
      scale_color_brewer(palette = "Dark2")
    print(plt)
  }
}

readRDS(here("data/diversity_analysis/mcas_iNEXT_gpt4_e250000.RDS")) %>% inext_plots()
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.

readRDS(here("data/diversity_analysis/mcas_iNEXT_dropSingle_gpt4_e200000.RDS")) %>% inext_plots()
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.

readRDS(here("data/diversity_analysis/mcas_iNEXT_dropSingle_psuedoMinus_gpt4_e200000.RDS")) %>% inext_plots()
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.

Final figures

2_Top_diagnoses

custom_labeler <- function(x, wrap_width=33) {
    x %>%
        str_replace("___.+$", "") %>%
        str_wrap(width = wrap_width)
}

plt_top <- top_diagnosis_plot(df_gpt4, n_diag = 25) + 
  theme(axis.text = element_text(size = 5, lineheight = 0.7), 
        strip.text = element_text(size = 7),
        axis.title = element_text(size = 9)) + 
  tidytext::scale_x_reordered(labels = custom_labeler)
Scale for x is already present.
Adding another scale for x, which will replace the existing scale.
plt_top

ggsave(plot=plt_top,filename=here("figures/3_Top_25_diagnoses.pdf"), width = 7.4, height = 6.5)

3A_Rank_abundance

plt_rank <- rank_abundance_plot(df_gpt4) +theme(legend.position = c(0.7,0.7))
plt_rank

3B_Cumulative_frequency

plt_cumulative <- cumulative_frequency_plot(df_gpt4) +
  labs(y = "Combined frequency\nof top 25 diagnoses")
plt_cumulative

3C_Diversity

plt_div <- df_diversity_ci %>% 
  format_criteria() %>% 
  ggplot(aes(x = criteria, y = shannon))+
  geom_point(size = 1)+
  geom_errorbar(aes(ymin = `2.5%`, ymax = `97.5%`), width = 0.3)+
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "", y = "Shannon diversity") 
plt_div

3D_Precision

plt_precision <- manual_box_plot(df_precision_stats)
plt_precision

3_Compiled

plt_fig3 <- plot_grid(
  plot_grid(NULL),
  plot_grid(
    plt_rank,
    NULL,
    plt_cumulative,
    nrow = 1, 
    rel_widths = c(3,0.2,2),
    labels = c("A", "", "B"),
    vjust = 0.2
  ),
  plot_grid(NULL),
  plot_grid(
    plt_div,
    NULL,
    plt_precision,
    nrow = 1,
    rel_widths = c(1,0.1,1),
    labels = c("C", "", "D"),
    vjust = 0.2
  ),
  ncol = 1,
  rel_heights = c(0.05,1,0.05,1)
)
plt_fig3

ggsave(plot=plt_fig3,filename=here("figures/4_Diagnosis_diversity.pdf"), width = 6.5, height = 6)

Table

diagnosis_rank_table(df_gpt4, "mast |mastoc|anaphylaxis") %>% mutate(diagnosis = substr(diagnosis, 1, 60)) %>% 
  select(contains(c("diagnosis","mcas"))) %>% 
  head(6) %>% 
  mutate(diagnosis = str_to_sentence(diagnosis)) %>% 
  rename(Diagnosis = diagnosis, `MCAS consortium` = mcas_consortium, `MCAS alternative` = mcas_alternative) %>% 
  flextable() %>% 
  width(width = 2) %>% 
  align(j = 2:3, align = "center", part = "all") %>% 
  {print(., preview = "pdf");.}

Diagnosis

MCAS consortium

MCAS alternative

Anaphylaxis

1

84

Mastocytosis

14

107

Mast cell activation syndrome

24

65

Systemic mastocytosis

27

155

Mast cell disorders

72

942

Idiopathic anaphylaxis

122

937

Supplemental figures

plot_grid(
  top_diagnosis_plot(df_gpt3, n_diag = 25) + 
    theme(axis.text = element_text(size = 4.5, lineheight = 0.7), 
          strip.text = element_text(size = 7),
          axis.title = element_text(size = 9)) + 
    tidytext::scale_x_reordered(labels = custom_labeler),
  
  top_diagnosis_plot(df_gpt4_icd, n_diag = 25) + 
    theme(axis.text = element_text(size = 4, lineheight = 0.7), 
          strip.text = element_text(size = 7),
          axis.title = element_text(size = 9)) + 
    tidytext::scale_x_reordered(labels = ~custom_labeler(., wrap_width = 45)),
  ncol = 1,
  rel_heights = c(0.9,1),
  labels = c("A","B")
) %>% 
  {ggsave(plot=.,filename=here("figures/S_Top_diagnoses.pdf"), width = 7, height = 10);.}

plot_grid(
  rank_abundance_plot(df_gpt3) +theme(legend.position = c(0.7,0.7)),
  rank_abundance_plot(df_gpt4_icd) +theme(legend.position = c(0.7,0.7)),
  nrow = 1,
  labels = c("A","B")
) %>% 
  {ggsave(plot=.,filename=here("figures/S_Ranked_abundance.pdf"), width = 7.5, height = 3);.}

---
title: "Diagnosis distribution analysis"
output: 
  html_notebook:
    toc: true
    toc_float: true
---

```{r, message = F}
library(here)
library(cowplot)
source(here("utils/data_processing.R"))
source(here("utils/figures.R"))
```

# Import data
```{r, message=F}
df_gpt3 <- read_csv(here("data/processed_diagnoses/diagnoses_gpt3.csv.gz"))
df_gpt4 <- read_csv(here("data/processed_diagnoses/diagnoses_gpt4.csv.gz"))
df_gpt4_icd <- read_csv(here("data/processed_diagnoses/diagnoses_gpt4_icd.csv.gz"))
```
# Rank abundance
```{r}
rank_abundance_plot(df_gpt3)+ggtitle("GPT3")
rank_abundance_plot(df_gpt4)+ggtitle("GPT4")
rank_abundance_plot(df_gpt4_icd)+ggtitle("GPT4 ICD")
```

# Top diagnoses
```{r, fig.width = 16, fig.height = 8}
n_diag <- 25
top_diagnosis_plot(df_gpt3, n_diag = n_diag)+ggtitle("GPT3")
top_diagnosis_plot(df_gpt4, n_diag = n_diag)+ggtitle("GPT4")
top_diagnosis_plot(df_gpt4_icd, n_diag = n_diag)+ggtitle("GPT4 ICD")
```

# Cumulative top frequency
```{r, fig.width=3, fig.height=3.5}
cumulative_frequency_plot(df_gpt3)+ggtitle("GPT3")
cumulative_frequency_plot(df_gpt4)+ggtitle("GPT4")
cumulative_frequency_plot(df_gpt4_icd)+ggtitle("GPT4 ICD")
```

# Diagnosis rank list
```{r}
diagnosis_rank_table(df_gpt3, "mast |mastoc|anaphylaxis") %>% mutate(diagnosis = substr(diagnosis, 1, 60))
diagnosis_rank_table(df_gpt4, "mast |mastoc|anaphylaxis") %>% mutate(diagnosis = substr(diagnosis, 1, 60)) 
diagnosis_rank_table(df_gpt4_icd, "mast |mastoc|anaphylaxis") %>% mutate(diagnosis = substr(diagnosis, 1, 60)) 
```

```{r}
diagnosis_rank_table(df_gpt4, "mast |mastoc|anaphylaxis") %>% mutate(diagnosis = substr(diagnosis, 1, 60)) %>% 
  select(contains(c("diagnosis","mcas"))) %>% 
  head(6) %>% 
  mutate(diagnosis = str_to_sentence(diagnosis)) %>% 
  rename(Diagnosis = diagnosis, `MCAS consortium` = mcas_consortium, `MCAS alternative` = mcas_alternative) %>% 
  flextable() %>% 
  width(width = 2) %>% 
  align(j = 2:3, align = "center", part = "all")
  
```




# Diversity
```{r, fig.width=4, fig.height=3.5}
set.seed(1234)
df_diversity_ci <- df_gpt4 %>% 
  count(criteria, diagnosis, sort = T) %>% 
  group_by(criteria) %>% 
  nest() %>% 
  mutate(boot = map(data, function(x){
    x <- deframe(x)
    x <- entropart::EntropyCI(entropart::Shannon, 
                              Simulations=500, 
                              Ns = x, q=1, 
                              Correction = "None")
    x
  })) %>% 
  mutate(shannon = map_dbl(data, 
                           ~log(entropart::Diversity(deframe(.), q=1, Correction="None")))) %>% # Calculate shannon with entropart
  mutate(boot = map(boot, ~quantile(., c(0.025,0.5,0.975)))) %>% 
  unnest_wider(boot)

df_diversity_ci

df_diversity_ci %>% 
  format_criteria() %>% 
  ggplot(aes(x = criteria, y = shannon))+
  geom_point(size = 1)+
  geom_errorbar(aes(ymin = `2.5%`, ymax = `97.5%`), width = 0.3)+
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "", y = "Shannon diversity") 
```

### Permutation testing

```{r, eval = F}
# Run outside of notebook
# Includes all 10,000 GPT iterations and 1000 bootstrap permutations 
# Contained in script source(here("scripts/diversity_analysis/permute_null_diversity_difference.R"))
library(here)
source(here("utils/data_processing.R"))
library(tidyverse)
library(vegan)
library(broom)
library(here)
library(furrr)
library(future.apply)

model <- "gpt4"
p <- 1000
i <- 10000

print("### Reading data")
read_path <- sprintf("data/processed_diagnoses/diagnoses_%s.csv.gz", model)
df <- read_csv(here(read_path))

print("### Calculating permutation")
perm_out <- difference_permutation_test(df, metric = "precision", permutations = p, gpt_iterations = i)

print("### Writing data")
write_path <- sprintf("data/diversity_analysis/precision_permutation_test_%s_p%s_i%s.RDS", model, p, i)
saveRDS(perm_out, here(write_path))
```

```{r}
diversity_permutation_results <- readRDS(here("data/diversity_analysis/diversity_permutation_test_gpt4.RDS")) 
diversity_permutation_results
```
```{r}
permutation_test_plot(diversity_permutation_results)
```


# Similarity

### Jaccard/Bray curtis
```{r, fig.width=4.25, fig.height=3.5}
diagnosis_similarity_heatmap(df_gpt3, method = "bray")
diagnosis_similarity_heatmap(df_gpt4, method = "bray")
diagnosis_similarity_heatmap(df_gpt3)
diagnosis_similarity_heatmap(df_gpt4)
```
- Bray-Curtis similarity measures the similarity of a given diagnostic criteria’s set of alternative diagnoses along with their frequencies.
- This demonstrates that SLE criteria results in a very similar set and frequency of diagnoses, while the diagnoses associated with two MCAS criteria are as different from each other as they are from those generated by the criteria of other conditions.

### PCA

```{r, fig.width=4.25, fig.height=3.5}
diagnosis_pca_plot(df_gpt3) + ggtitle("GPT3")
diagnosis_pca_plot(df_gpt4) + ggtitle("GPT4")
```
# Precision

- Precision represents how similar each iteration of a 10-point differential diagnosis is with all other differential diagnoses from the same set of criteria. 
- I.e. how reproducible the 10-point differential diagnosis is for each criteria
- Measured by obtaining the Bray-Curtis similarity values between all iterations within a criteria

```{r, eval=F}
# Script for calculating all Bray-Curtis similarity values within a criteria
# Found in source(here("scripts/diversity_analysis/calculate_precision.R"))
library(here)
source(here("utils/data_processing.R"))

models <- c("gpt3", "gpt4", "gpt4_icd")

for (m in models){
  print(sprintf("READING IN DATA FOR: %s", m))
  read_path <- sprintf("data/processed_diagnoses/diagnoses_%s.csv.gz", m)
  df <- read_csv(here(read_path))
  
  print(sprintf("CALCULATING PRECISION FOR: %s", m))
  df <- calculate_precision(df)
  
  print(sprintf("WRITING PRECISION DATA FOR: %s", m))
  out_path <- sprintf("data/diversity_analysis/diagnosis_precision_%s.csv.gz", m)
  write_csv(df, here(out_path))
}
```

```{r}
df_precision <- vroom::vroom(here("data/diversity_analysis/diagnosis_precision_gpt4.csv.gz"))
```

```{r}
df_precision_summary <- df_precision %>% 
  mutate(distance = 1-distance) %>% # Convert to similarity
  summarise(mean_cl_normal(distance), .by = criteria)
df_precision_summary
```

```{r, fig.width = 3, fig.height = 3}
calculate_boxplot_stats <- function(df){
  df %>% 
    nest(data = distance, .by = criteria) %>% 
    mutate(box = map(data, function(df){
      x <- boxplot.stats(df$distance)$stats
      x <- sort(x)
      names(x) <- c("min","lower","median","upper","max")
      return(x)
    })) %>% 
    select(-data) %>% 
    unnest_wider(box)
}



manual_box_plot <- function(df){
  df %>% 
    format_criteria() %>% 
    ggplot(aes(
      x = criteria,
      ymin = min,
      lower = lower,
      middle = median,
      upper = upper,
      ymax = max
    ))+
    geom_boxplot(stat = "identity") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))+
    labs(x = "", y = "Bray-Curtis similarity")
}

```


```{r}
df_precision_stats <- df_precision %>% 
  mutate(distance = 1-distance) %>% # Convert to similarity
  calculate_boxplot_stats()
df_precision_stats
```
```{r, fig.width=3, fig.height=3}
manual_box_plot(df_precision_stats)
```
### Permutation testing

```{r, eval = F}
# Run outside of notebook
# Tried multiple combinations of bootstrap permutations and gpt_iterations
# Because a single bray-curtis matrix of all 10,000 comparisons takes ~10 minutes
# Contained in following script:
# source(here("scripts/diversity_analysis/permute_null_precision_difference.R"))
model <- "gpt4"
p <- 1000
i <- 10000

print("### Reading data")
read_path <- sprintf("data/processed_diagnoses/diagnoses_%s.csv.gz", model)
df <- read_csv(here(read_path))

print("### Calculating permutation")
perm_out <- difference_permutation_test(df, metric = "precision", permutations = p, gpt_iterations = i)

print("### Writing data")
write_path <- sprintf("data/diversity_analysis/precision_permutation_test_%s_p%s_i%s.RDS", model, p, i)
saveRDS(perm_out, here())
```


```{r}
readRDS(here("data/diversity_analysis/precision_permutation_test_gpt4_p1000_i10000.RDS"))
```


```{r}
readRDS(here("data/diversity_analysis/precision_permutation_test_gpt4_p1000_i10000.RDS")) %>% permutation_test_plot()
```


# iNEXT

```{r, fig.width=12, fig.height=4}
inext_plots <- function(inext_obj){
  for (i in 1:3){
    plt <- iNEXT::ggiNEXT(inext_obj, type=i, facet.var="Assemblage", color.var="Assemblage") +
      theme_classic() + 
      scale_color_brewer(palette = "Set1") +
      theme(axis.text.x = element_text(angle = 90))+
      scale_color_brewer(palette = "Dark2")
    print(plt)
  }
}

readRDS(here("data/diversity_analysis/mcas_iNEXT_gpt4_e250000.RDS")) %>% inext_plots()
readRDS(here("data/diversity_analysis/mcas_iNEXT_dropSingle_gpt4_e200000.RDS")) %>% inext_plots()
readRDS(here("data/diversity_analysis/mcas_iNEXT_dropSingle_psuedoMinus_gpt4_e200000.RDS")) %>% inext_plots()
```


# Final figures

### 2_Top_diagnoses
```{r, fig.width=7.4, fig.height=6.5}
custom_labeler <- function(x, wrap_width=33) {
    x %>%
        str_replace("___.+$", "") %>%
        str_wrap(width = wrap_width)
}

plt_top <- top_diagnosis_plot(df_gpt4, n_diag = 25) + 
  theme(axis.text = element_text(size = 5, lineheight = 0.7), 
        strip.text = element_text(size = 7),
        axis.title = element_text(size = 9)) + 
  tidytext::scale_x_reordered(labels = custom_labeler)
plt_top
```

```{r}
ggsave(plot=plt_top,filename=here("figures/3_Top_25_diagnoses.pdf"), width = 7.4, height = 6.5)
```


### 3A_Rank_abundance
```{r, fig.width=3.5, fig.height=2.5}
plt_rank <- rank_abundance_plot(df_gpt4) +theme(legend.position = c(0.7,0.7))
plt_rank
```

### 3B_Cumulative_frequency

```{r, fig.width=2.5, fig.height=2.5}
plt_cumulative <- cumulative_frequency_plot(df_gpt4) +
  labs(y = "Combined frequency\nof top 25 diagnoses")
plt_cumulative
```



### 3C_Diversity
```{r, fig.width=3, fig.height=3}
plt_div <- df_diversity_ci %>% 
  format_criteria() %>% 
  ggplot(aes(x = criteria, y = shannon))+
  geom_point(size = 1)+
  geom_errorbar(aes(ymin = `2.5%`, ymax = `97.5%`), width = 0.3)+
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "", y = "Shannon diversity") 
plt_div
```
### 3D_Precision
```{r, fig.width=3, fig.height=3}
plt_precision <- manual_box_plot(df_precision_stats)
plt_precision
```
### 3_Compiled 

```{r, fig.width=6.5, fig.height=6}
plt_fig3 <- plot_grid(
  plot_grid(NULL),
  plot_grid(
    plt_rank,
    NULL,
    plt_cumulative,
    nrow = 1, 
    rel_widths = c(3,0.2,2),
    labels = c("A", "", "B"),
    vjust = 0.2
  ),
  plot_grid(NULL),
  plot_grid(
    plt_div,
    NULL,
    plt_precision,
    nrow = 1,
    rel_widths = c(1,0.1,1),
    labels = c("C", "", "D"),
    vjust = 0.2
  ),
  ncol = 1,
  rel_heights = c(0.05,1,0.05,1)
)
plt_fig3
ggsave(plot=plt_fig3,filename=here("figures/4_Diagnosis_diversity.pdf"), width = 6.5, height = 6)
```

### Table

```{r, message = F, warning=FALSE}
diagnosis_rank_table(df_gpt4, "mast |mastoc|anaphylaxis") %>% mutate(diagnosis = substr(diagnosis, 1, 60)) %>% 
  select(contains(c("diagnosis","mcas"))) %>% 
  head(6) %>% 
  mutate(diagnosis = str_to_sentence(diagnosis)) %>% 
  rename(Diagnosis = diagnosis, `MCAS consortium` = mcas_consortium, `MCAS alternative` = mcas_alternative) %>% 
  flextable() %>% 
  width(width = 2) %>% 
  align(j = 2:3, align = "center", part = "all") %>% 
  {print(., preview = "pdf");.}
```

### Supplemental figures

```{r, fig.width=7, fig.height=10, message = F}
plot_grid(
  top_diagnosis_plot(df_gpt3, n_diag = 25) + 
    theme(axis.text = element_text(size = 4.5, lineheight = 0.7), 
          strip.text = element_text(size = 7),
          axis.title = element_text(size = 9)) + 
    tidytext::scale_x_reordered(labels = custom_labeler),
  
  top_diagnosis_plot(df_gpt4_icd, n_diag = 25) + 
    theme(axis.text = element_text(size = 4, lineheight = 0.7), 
          strip.text = element_text(size = 7),
          axis.title = element_text(size = 9)) + 
    tidytext::scale_x_reordered(labels = ~custom_labeler(., wrap_width = 45)),
  ncol = 1,
  rel_heights = c(0.9,1),
  labels = c("A","B")
) %>% 
  {ggsave(plot=.,filename=here("figures/S_Top_diagnoses.pdf"), width = 7, height = 10);.}
```
```{r, fig.width=7.5, fig.height=3}
plot_grid(
  rank_abundance_plot(df_gpt3) +theme(legend.position = c(0.7,0.7)),
  rank_abundance_plot(df_gpt4_icd) +theme(legend.position = c(0.7,0.7)),
  nrow = 1,
  labels = c("A","B")
) %>% 
  {ggsave(plot=.,filename=here("figures/S_Ranked_abundance.pdf"), width = 7.5, height = 3);.}
```



